library(tidyverse) # for data manipulation and plotting
library(rvest) # for webscraping
# ------------------------------------------------------------------------------
# set up for plotting
# ------------------------------------------------------------------------------
theme_set(theme_minimal())
# colour palette
highlight_colour <- "#35469D"
For publishing on github: https://liuyanguu.github.io/post/2021/01/06/rmarkdown-to-github-pages/
key points - 2 tours in separate repos - For each tour each years match results are in a separate csv - objective: create a dataframe of results for from alls years for each tour
#' Get WTA or ATP tour match results from the Tennis Abstract github for the specified year (https://github.com/JeffSackmann)
#'
#' @param year an integer. The earliest years where there are data: wta 1920; atp 1968.
#' @param tour a string either 'wta' or 'atp'
#'
#' @return a dataframe of match results
get_res <- function(year, tour){
# form file location string based on year and tour
repo_url <- glue::glue("https://raw.githubusercontent.com/JeffSackmann/tennis_{tour}/master/")
file_name <- glue::glue("{tour}_matches_{year}.csv")
file_location <- str_c(repo_url, file_name)
#browser()
# read in data from csv on github
res_year <- read_csv(file_location,
# address issue with column type being read differently
# in different files
col_types = list(winner_seed = col_double(),
loser_seed = col_double(),
draw_size = col_double())) %>%
# make variable names consistent
janitor::clean_names() %>%
# add identifiers
mutate(tour = tour,
year = year)
return(res_year)
}
# ------------------------------------------------------------------------------
# download results
# ------------------------------------------------------------------------------
start_year <- 1968
end_year <- 2022
# aggregate atp results into a single dataframe
atp_results <- seq(start_year, end_year) %>%
map_df(~get_res(., "atp"))
# aggregate wta results into a single dataframe
wta_results <- seq(start_year, end_year) %>%
map_df(~get_res(., "wta"))
Key points: - observations are matches - the players identified as
values of the following variables winner_id,
winner_name, loser_id,
loser_name. - Single variables split across two columns
(e.g. winner_age and loser_age actual a single
variable age) winner_ioc and
loser_ioc - Move to observations being a single player’s
match results
#' Get observations relating to either match winners or losers from
#' tennis abstract match raw result data
#'
#' @param raw_results_df a dataframe of tennis abstract match result raw data
#' @param result specifies whether `winner` or `loser`data should be returned
#'
#' @return a dataframe of match data for either winners or losers
get_data_by_result <- function(raw_results_df, result){
res_to_retain <- result
if(res_to_retain == "winner"){
res_to_retain_suffix <- "w_"
res_to_drop <- "loser"
res_to_drop_suffix <- "l_"
}
else if (res_to_retain == "loser"){
res_to_retain_suffix <- "l_"
res_to_drop <- "winner"
res_to_drop_suffix <- "w_"
}
# tidy up data for match losers
raw_results_df %>%
# drop columns containing match stats of the winner
select(-c(starts_with(res_to_drop), starts_with(res_to_drop_suffix))) %>%
# simplify column naming to enable row binding of winners and losers
rename_with(.fn = ~str_replace( .x, str_c(res_to_retain, "_"), ""),
.cols = starts_with(res_to_retain)) %>%
rename_with(.fn = ~str_replace( .x, res_to_retain_suffix, ""),
.cols = starts_with(res_to_retain_suffix)) %>%
# add result column and match_id
mutate(result = res_to_retain,
match_id = 1:n(),
match_id_str = str_c(match_id, tour, year, sep = "_"),
.after = name)
}
#' Puts tennis abstract match raw result data in a tidy form
#'
#' @param raw_results_df a dataframe of tennis abstract match result raw data
#'
#' @return a tidy dataframe of tennis abstract match results data
tidy_match_results <- function(raw_results_df){
losers <- get_data_by_result(raw_results_df, "loser")
winners <- get_data_by_result(raw_results_df, "winner")
results_tidy <-
# recombine data on winning and losing players
bind_rows(losers, winners) %>%
# order for readability
arrange(year, tour, match_id) %>%
# convert variable types where needed
mutate(tourney_date = lubridate::ymd(tourney_date))
return(results_tidy)
}
# ------------------------------------------------------------------------------
# tidy results
# ------------------------------------------------------------------------------
atp_results_tidy <- tidy_match_results(atp_results)
wta_results_tidy <- tidy_match_results(wta_results)
# create single dataframe with results from both tours
all_results_tidy <- bind_rows(
atp_results_tidy, wta_results_tidy
)
results_focused <- all_results_tidy %>%
# focus on entrants to grand slam tournaments
filter(tourney_level == "G") %>%
# focus on variables of interest
select(year, tourney_name, tour, name, id,
country_code = ioc) %>%
# remove duplicates where players have appeared in multiple rounds
unique()
results_focused
# look for missing data in post 1968 results
results_focused %>%
count(year, tourney_name) %>%
filter(n != 256)
# focus on Grand Slams from 1990 onwards
results_clean <- results_focused %>%
filter(year >= 1990) %>%
mutate(tourney_name = replace(tourney_name,
tourney_name == "Us Open",
"US Open"))
# confirm there is no missing data for Grand Slams 1990 onwards
num_missing <- results_clean %>%
count(year, tourney_name) %>%
filter(n != 256) %>%
nrow()
assertthat::assert_that(num_missing == 0)
## [1] TRUE
Source: Our World in Data
gdp_focus <- read_csv("data/gdp-per-capita-worldbank.csv") %>%
# make variable names consistent
janitor::clean_names() %>%
# rename variables to make them easier to work with
rename(gdp_per_capita = gdp_per_capita_ppp_constant_2017_international,
country = entity,
country_code = code) %>%
# focus on 1990 onwards in line with Grand Slam entrant data
filter(year >= 1990)
# remove entities which are not countries (e.g. regions or continents)
# i.e. those which do not have country codes
non_country_entities <- gdp_focus %>%
filter(is.na(country_code)) %>%
distinct(country) %>%
.$country
gdp_focus <- gdp_focus %>%
filter(!country %in% non_country_entities) %>%
filter(country != "World")
look for explicitly missing data and implicitly missing data
# -----------------------------------------------------------------------------
# explore explicitly missing data
# -----------------------------------------------------------------------------
visdat::vis_miss(gdp_focus) +
labs(title = "No explicitly missing data")
# -----------------------------------------------------------------------------
# explore implicitly missing data
# -----------------------------------------------------------------------------
# create a dataframe with all combinations of year and country
gdp_focus_exp_grid <- expand_grid(country = unique(gdp_focus$country),
year = unique(gdp_focus$year)) %>%
# add in gdp data where available
left_join(gdp_focus)
# show implicitly missing data
gdp_missing <- gdp_focus_exp_grid %>%
filter(is.na(gdp_per_capita))
gdp_missing
visdat::vis_miss(gdp_focus_exp_grid) +
labs(title = "But there is implicitly missing data")
### exploring and imputing missing data
# look at countries with missing data
gdp_missing %>%
count(country)
countries_gdp_missing <- unique(gdp_missing$country)
# look at where the missing data is in each country's time series
gdp_missing %>%
arrange(country, year)
p <- gdp_focus_exp_grid %>%
# only need to plot countries with missing data
filter(country %in% countries_gdp_missing) %>%
# create the plot
ggplot(aes(year, gdp_per_capita, group = country)) +
geom_line(alpha = 0.8, colour = highlight_colour)
# make interactive with tooltips for data exploration
plotly::ggplotly(p)
Given the pattern of the missing data get strange behavior trying to use an interpolation function
library(zoo)
strange_interpolation <- c("Kosovo", "Cayman Islands", "Iceland")
p <- gdp_focus_exp_grid %>%
mutate(
gdp_per_capita = zoo::na.approx(gdp_per_capita),
highlight = country %in% strange_interpolation) %>%
# only need to plot countries with missing data
filter(country %in% countries_gdp_missing) %>%
# create the plot
ggplot(aes(year, gdp_per_capita,
group = country,
colour = highlight)) +
geom_line(alpha = 0.8) +
scale_colour_manual(values = c("#C7C2C4", highlight_colour)) +
theme(legend.position = "none")
# make interactive with tooltips for data exploration
plotly::ggplotly(p)
So use a simple method, based on the pattern of missing data
# -----------------------------------------------------------------------------
# find the earliest record of gdp_per_capita for each country
# -----------------------------------------------------------------------------
earliest_gdp <- gdp_focus_exp_grid %>%
# focus on relevant observations
filter(!is.na(gdp_per_capita),
country %in% countries_gdp_missing) %>%
# identify earliest year with gdp_per_cap observation
group_by(country) %>%
mutate(min_year = min(year)) %>%
ungroup() %>%
# focus only on each countries earliest year with gdp_per_cap observation
filter(year == min_year) %>%
# retain only the columns needed to simplify the join below
select(earliest_gdp_per_capita = gdp_per_capita,
country_code,
country)
# -----------------------------------------------------------------------------
# for each country replace missing gdp_per_capita data with earliest
# gdp_per_cap observation
# -----------------------------------------------------------------------------
gdp_clean <- gdp_focus_exp_grid %>%
left_join(earliest_gdp, by = c("country" = "country")) %>%
# replace nas
mutate(gdp_per_capita =
if_else(
is.na(gdp_per_capita),
earliest_gdp_per_capita,
gdp_per_capita
),
country_code =
if_else(
is.na(country_code.x),
country_code.y,
country_code.x
)
) %>%
# tidy up dataframe
arrange(country,year) %>%
select(-c(country_code.x, country_code.y, earliest_gdp_per_capita))
#' Quickly check how well two dataframes join
#'
#' @param df1 the dataframe passed as the first argument to the
#' @param df2
#' @param join_func the specific join function to be applied (e.g. left_join)
#' @param ... other parameters to be passed to the join function
#'
#' @return a vector including the number of observations which are of join successes and failures
join_performance <- function(df1, df2, join_func, ...){
# perform the join
join <- df1 %>%
join_func(df2, ...)
# the number of observations which are of join failures
num_join_miss <- join %>%
filter(is.na(gdp_per_capita)) %>%
distinct(country_code) %>%
nrow()
# the number of observations which are of join successes
num_join_hit <- join %>%
filter(!is.na(gdp_per_capita)) %>%
distinct(country_code) %>%
nrow()
# return the number of observations which are of join successes and failures
c("join_sucesses" = num_join_hit, "join_failures" = num_join_miss)
}
#-------------------------------------------------------------------------------
# prepare the two data frames for joining
#-------------------------------------------------------------------------------
# gdp data only goes up to 2020, so remove 2021 and 2022 match data
results_clean <- filter(results_clean, year <= 2020) %>%
# remove any white space ahead of using to join with gdp data
mutate(country_code = str_trim(country_code, side = "both"))
# remove any white space ahead of using to join with tennis data
gdp_clean <- gdp_clean %>%
mutate(country_code = str_trim(country_code, side = "both"))
#-------------------------------------------------------------------------------
# look at initial join performance
#-------------------------------------------------------------------------------
join_performance(results_clean,
gdp_clean,left_join)
## join_sucesses join_failures
## 55 35
#-------------------------------------------------------------------------------
# web scrape a look up table for mapping between iso country codes (gdp data)
# and ioc codes (tennis data)
#-------------------------------------------------------------------------------
lookup_raw <- read_html("https://simple.wikipedia.org/wiki/Comparison_of_IOC,_FIFA,_and_ISO_3166_country_codes") %>%
html_nodes(xpath="/html/body/div[3]/div[3]/div[5]/div[1]/table[1]") %>%
html_table() %>%
pluck(1)
iso_ioc_lookup <- lookup_raw %>%
janitor::clean_names() %>%
select(ioc, iso)
#-------------------------------------------------------------------------------
# replace iso country codes in gdp data with ioc codes (as used in tennis data)
#-------------------------------------------------------------------------------
gdp_clean_ioc <- gdp_clean %>%
left_join(iso_ioc_lookup,
by = c("country_code" = "iso"))
# just one country without an ioc code so ignore
gdp_clean_ioc %>%
filter(is.na(ioc))
gdp_clean_ioc <- gdp_clean_ioc %>%
mutate(country_code = ioc) %>%
select(-ioc)
#-------------------------------------------------------------------------------
# check if join performance has improved
#-------------------------------------------------------------------------------
join_performance(results_clean,
gdp_clean_ioc,left_join)
## join_sucesses join_failures
## 81 9
# look at players/countries where no match to gdp data
no_match <- results_clean %>%
left_join(gdp_clean_ioc) %>%
filter(is.na(gdp_per_capita))
no_match %>%
distinct(year, name, country_code)
no_match %>%
distinct(country_code)
unique(gdp_clean_ioc$country_code)
## [1] "AFG" "ALB" "ALG" "ANG" "ANT" "ARG" "ARM" "ARU" "AUS" "AUT" "AZE" "BAH"
## [13] "BRN" "BAN" "BAR" "BLR" "BEL" "BIZ" "BEN" "BER" "BHU" "BOL" "BIH" "BOT"
## [25] "BRA" "BRU" "BUL" "BUR" "BDI" "CAM" "CMR" "CAN" "CPV" "CAY" "CAF" "CHA"
## [37] "CHI" "CHN" "COL" "COM" "CGO" "CRC" "CIV" "CRO" "" "CYP" "CZE" "COD"
## [49] "DEN" "DJI" "DMA" "DOM" "ECU" "EGY" "ESA" "GEQ" "EST" "SWZ" "ETH" "FIJ"
## [61] "FIN" "FRA" "GAB" "GAM" "GEO" "GER" "GHA" "GRE" "GRN" "GUA" "GUI" "GBS"
## [73] "GUY" "HAI" "HON" "HKG" "HUN" "ISL" "IND" "INA" "IRI" "IRQ" "IRL" "ISR"
## [85] "ITA" "JAM" "JPN" "JOR" "KAZ" "KEN" "KIR" NA "KUW" "KGZ" "LAO" "LAT"
## [97] "LIB" "LES" "LBR" "LBA" "LTU" "LUX" "MAD" "MAW" "MAS" "MDV" "MLI" "MLT"
## [109] "MHL" "MTN" "MRI" "MEX" "FSM" "MDA" "MGL" "MNE" "MAR" "MOZ" "MYA" "NAM"
## [121] "NRU" "NEP" "NED" "NZL" "NCA" "NIG" "NGR" "MKD" "NOR" "OMA" "PAK" "PLW"
## [133] "PLE" "PAN" "PNG" "PAR" "PER" "PHI" "POL" "POR" "PUR" "QAT" "ROU" "RUS"
## [145] "RWA" "SKN" "LCA" "VIN" "SAM" "SMR" "STP" "KSA" "SEN" "SRB" "SEY" "SLE"
## [157] "SGP" "SVK" "SLO" "SOL" "SOM" "RSA" "KOR" "ESP" "SRI" "SUD" "SUR" "SWE"
## [169] "SUI" "TJK" "TAN" "THA" "TLS" "TOG" "TGA" "TTO" "TUN" "TUR" "TKM" "TUV"
## [181] "UGA" "UKR" "UAE" "GBR" "USA" "URU" "UZB" "VAN" "VIE" "ZAM" "ZIM"
# -----------------------------------------------------------------------------
# Identify predecessor countries and check for the names of countries with
# no gdp data
# -----------------------------------------------------------------------------
# set up string for searching within country names
country_check <- c("ven" = c("venezuela"),
"yug" = c("yugoslavia"),
"cub" = c("cuba"),
"mon" = c("monaco"),
"tpe" = c("taiwan"),
"tch" = c("czechoslovakia", "cz"),
"urs" = c("soviet union", "ussr", "russia"),
"scg" = c("serbia", "montenegro"),
"lie" = c("liechtenstein", "liech", "germany"))
# combine country search strings into a single regular expression
# | is OR
country_check_str <- str_c(country_check, collapse = "|")
# search strings found in the following countries present in the dataset
matches_found <- gdp_clean_ioc %>%
mutate(country_check = str_detect(str_to_lower(country), country_check_str)) %>%
filter(country_check) %>%
distinct(country, country_code)
matches_found
# -----------------------------------------------------------------------------
# Where possible find a country that approximately matches for countries with
# no gdp data
# -----------------------------------------------------------------------------
country_code_replacement_lookup <- matches_found %>%
mutate(code_to_replace = c("TCH", "LIE", NA, "URS", "SCG" )) %>%
rename(replacement_code = country_code) %>%
select(-country) %>%
filter(!is.na(code_to_replace))
results_clean <- results_clean %>%
left_join(country_code_replacement_lookup,
by = c("country_code" = "code_to_replace")) %>%
mutate(country_code = if_else(!is.na(replacement_code),
replacement_code,
country_code)) %>%
select(-replacement_code)
#-------------------------------------------------------------------------------
# check if join performance has improved
#-------------------------------------------------------------------------------
join_performance(results_clean,
gdp_clean_ioc,left_join)
## join_sucesses join_failures
## 81 5
# good enough
results_gdp <- left_join(results_clean, gdp_clean_ioc)
results_gdp %>%
relocate(gdp_per_capita, country) %>%
visdat::vis_miss()
# -----------------------------------------------------------------------------
# calculate number of first round grand slam entrants for each country each
# year
# -----------------------------------------------------------------------------
num_gs_entrants <- results_gdp %>%
group_by(year, country) %>%
mutate(num_first_rd = n()) %>%
ungroup() %>%
select(-c(id, name, tour, tourney_name)) %>%
distinct() %>%
arrange(year, desc(num_first_rd))
num_gs_entrants
# num of countries represented in first round of grand slam singles
num_countries <- num_gs_entrants %>%
count(year) %>%
rename(num_countries_represented = n)
# -----------------------------------------------------------------------------
# create exploratory plots
# -----------------------------------------------------------------------------
ggplot(num_countries, aes(year, num_countries_represented)) +
geom_col()
p <- ggplot(num_gs_entrants, aes(year, num_first_rd, group = country)) +
geom_line(alpha = 0.5)
plotly::ggplotly(p) # make interactive to explore which line is which country
# calculated country income deciles for each country
# use gdp data (all countries) rather than tennis data (subset of countries)
income_deciles <- gdp_clean_ioc %>%
group_by(year) %>%
mutate(income_decile = ntile(gdp_per_capita, 10)) %>%
ungroup() %>%
select(-c(country, gdp_per_capita))
# add the newly calculated deciles into the tennis/gdp dataframe
num_gs_entrants_inc_dec <- num_gs_entrants %>%
left_join(income_deciles)
#' Plot the percentage of Grand Slam First Round appearances made by players
#' from countries in the top n percentage of highest income
#'
#' @param df a dataframe containing the grand slam first round appearance data
#' @param decile integer between 1 and 10
plot_top_n_perc <- function(df, decile){
# create a variable names and plot title based on the decile chosen
top_n_perc <- (10 - decile + 1) * 10
var_str <- str_c("top_", top_n_perc, "_perc")
plot_title <- str_c("The top ", top_n_perc, "% highest income countries\n")
# test if each observation is for specified decile or a higher decile
df[, var_str] <-
df$income_decile >= decile
# calculate number of first round appearance for top n % of countries
df_plot <- df %>%
na.omit() %>%
group_by(year, !!sym(var_str)) %>%
summarise(num_first_rd = sum(num_first_rd)) %>%
ungroup() %>%
group_by(year) %>%
mutate(perc_first_rd = num_first_rd / sum(num_first_rd) * 100) %>%
ungroup() %>%
filter(!!sym(var_str))
# create the plot itself
df_plot %>%
ggplot(aes(year, perc_first_rd)) +
geom_line(colour = "#374E83") +
# tidy up labelling of the plot
labs(x = NULL, y = NULL,
# put y axis label at the top of the y axis
subtitle = "Percentage of first round\nGrand Slam appearances\n",
# identify the top n % for the plot
title = plot_title) +
# tidy up axis
scale_y_continuous(labels = scales::label_percent(scale = 1),
limits = c(0,100)) +
scale_x_continuous(breaks = seq(1990, 2020, 5)) +
coord_cartesian(expand = FALSE, clip = "off") +
# tidy positioning of text and gridlines
theme(panel.grid.minor.x = element_blank(),
plot.title.position = "plot",
plot.margin = margin(t = 20, r = 30, b = 20, l = 20, unit = "pt"))
}
#-------------------------------------------------------------------------------
# create exploratory plots for the top n percent of highest income countries
#-------------------------------------------------------------------------------
seq(10,1) %>%
map(~plot_top_n_perc(num_gs_entrants_inc_dec, .))
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
# -----------------------------------------------------------------------------
# calculate number of first round entrants for each grand slam for each country
# each year
# -----------------------------------------------------------------------------
num_gs_tourney_entrants <- results_gdp %>%
group_by(year, tourney_name, country) %>%
mutate(num_first_rd = n()) %>%
ungroup() %>%
select(-c(id, name, tour)) %>%
distinct() %>%
arrange(year, tourney_name, desc(num_first_rd)) %>%
# add income deciles
left_join(income_deciles) %>%
# test if observation relates to a top 20% income country
mutate(top_20_perc = income_decile >= 9)
num_gs_tourney_entrants
# -----------------------------------------------------------------------------
# create main data layer for the plot
# -----------------------------------------------------------------------------
# calculate number of first round appearance for top n % of countries
df_plot <- num_gs_tourney_entrants %>%
na.omit() %>%
group_by(year, tourney_name, top_20_perc) %>%
summarise(num_first_rd = sum(num_first_rd)) %>%
ungroup() %>%
group_by(year, tourney_name) %>%
mutate(perc_first_rd = num_first_rd / sum(num_first_rd) * 100) %>%
ungroup() %>%
filter(top_20_perc)
# -----------------------------------------------------------------------------
# create additional data layers for the plot
# -----------------------------------------------------------------------------
annotation_layer <- df_plot %>%
group_by(tourney_name) %>%
mutate(min_perc_first_rd = min(perc_first_rd),
min_year = min(year),
max_year = max(year)) %>%
ungroup() %>%
filter(min_perc_first_rd == perc_first_rd |
year == min_year |
year == max_year)
# -----------------------------------------------------------------------------
# create the plot itself
# -----------------------------------------------------------------------------
p <- df_plot %>%
ggplot(aes(year,
perc_first_rd,
colour = tourney_name)) +
geom_line() +
geom_point(data = annotation_layer) +
ggrepel::geom_text_repel(data = annotation_layer,
mapping = aes(label = round(perc_first_rd))) +
# tidy up labelling of the plot
labs(x = NULL, y = NULL,
# put y axis label at the top of the y axis
subtitle = "Percentage of first round\nGrand Slam appearances\n",
# identify the top n % for the plot
title = "plot_title") +
scale_colour_manual(values = c("#6A8EAF", "#D97A53",
"#35469D", "#216E4B")) +
# tidy up axis
scale_y_continuous(labels = scales::label_percent(scale = 1),
limits = c(50,100)) +
scale_x_continuous(breaks = seq(1990, 2020, 5)) +
coord_cartesian(expand = FALSE, clip = "off") +
# tidy positioning of text and gridlines
theme(panel.grid.minor = element_blank(),
panel.spacing.x = unit(2, "cm"),
panel.spacing.y = unit(1, "cm"),
plot.title.position = "plot",
#plot.margin = margin(t = 20, r = 30, b = 20, l = 20, unit = "pt"),
legend.position = "none",
strip.text = element_text(margin = margin(10,5,10,5,"pt")),
plot.margin = unit(c(2,2,2,2), 'cm')) +
facet_wrap(~tourney_name)
p
ggsave("images/perc_top_20.svg", width = 297, height = 210, units = "mm")